{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "l0Y7_lgN4jzM"
      },
      "source": [
        "# **Bioinformatics Project - Computational Drug Discovery [Part 2] \n",
        "\n",
        "Chanin Nantasenamat\n",
        "\n",
        "[*'Data Professor' YouTube channel*](http://youtube.com/dataprofessor)\n",
        "\n",
        "---"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "o-4IOizard4P"
      },
      "source": [
        "## **Install conda and rdkit**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "H0mjQ2PcrSe5",
        "outputId": "3179c4ce-7d6c-45c5-e0a2-0d798137b93c"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "--2022-01-09 10:35:50--  https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh\n",
            "Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.131.3, 104.16.130.3, 2606:4700::6810:8303, ...\n",
            "Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.131.3|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 85055499 (81M) [application/x-sh]\n",
            "Saving to: ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh’\n",
            "\n",
            "Miniconda3-py37_4.8 100%[===================>]  81.12M   157MB/s    in 0.5s    \n",
            "\n",
            "2022-01-09 10:35:51 (157 MB/s) - ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh’ saved [85055499/85055499]\n",
            "\n",
            "PREFIX=/usr/local\n",
            "Unpacking payload ...\n",
            "Collecting package metadata (current_repodata.json): - \b\b\\ \b\b| \b\bdone\n",
            "Solving environment: - \b\b\\ \b\bdone\n",
            "\n",
            "## Package Plan ##\n",
            "\n",
            "  environment location: /usr/local\n",
            "\n",
            "  added / updated specs:\n",
            "    - _libgcc_mutex==0.1=main\n",
            "    - asn1crypto==1.3.0=py37_0\n",
            "    - ca-certificates==2020.1.1=0\n",
            "    - certifi==2019.11.28=py37_0\n",
            "    - cffi==1.14.0=py37h2e261b9_0\n",
            "    - chardet==3.0.4=py37_1003\n",
            "    - conda-package-handling==1.6.0=py37h7b6447c_0\n",
            "    - conda==4.8.2=py37_0\n",
            "    - cryptography==2.8=py37h1ba5d50_0\n",
            "    - idna==2.8=py37_0\n",
            "    - ld_impl_linux-64==2.33.1=h53a641e_7\n",
            "    - libedit==3.1.20181209=hc058e9b_0\n",
            "    - libffi==3.2.1=hd88cf55_4\n",
            "    - libgcc-ng==9.1.0=hdf63c60_0\n",
            "    - libstdcxx-ng==9.1.0=hdf63c60_0\n",
            "    - ncurses==6.2=he6710b0_0\n",
            "    - openssl==1.1.1d=h7b6447c_4\n",
            "    - pip==20.0.2=py37_1\n",
            "    - pycosat==0.6.3=py37h7b6447c_0\n",
            "    - pycparser==2.19=py37_0\n",
            "    - pyopenssl==19.1.0=py37_0\n",
            "    - pysocks==1.7.1=py37_0\n",
            "    - python==3.7.6=h0371630_2\n",
            "    - readline==7.0=h7b6447c_5\n",
            "    - requests==2.22.0=py37_1\n",
            "    - ruamel_yaml==0.15.87=py37h7b6447c_0\n",
            "    - setuptools==45.2.0=py37_0\n",
            "    - six==1.14.0=py37_0\n",
            "    - sqlite==3.31.1=h7b6447c_0\n",
            "    - tk==8.6.8=hbc83047_0\n",
            "    - tqdm==4.42.1=py_0\n",
            "    - urllib3==1.25.8=py37_0\n",
            "    - wheel==0.34.2=py37_0\n",
            "    - xz==5.2.4=h14c3975_4\n",
            "    - yaml==0.1.7=had09818_2\n",
            "    - zlib==1.2.11=h7b6447c_3\n",
            "\n",
            "\n",
            "The following NEW packages will be INSTALLED:\n",
            "\n",
            "  _libgcc_mutex      pkgs/main/linux-64::_libgcc_mutex-0.1-main\n",
            "  asn1crypto         pkgs/main/linux-64::asn1crypto-1.3.0-py37_0\n",
            "  ca-certificates    pkgs/main/linux-64::ca-certificates-2020.1.1-0\n",
            "  certifi            pkgs/main/linux-64::certifi-2019.11.28-py37_0\n",
            "  cffi               pkgs/main/linux-64::cffi-1.14.0-py37h2e261b9_0\n",
            "  chardet            pkgs/main/linux-64::chardet-3.0.4-py37_1003\n",
            "  conda              pkgs/main/linux-64::conda-4.8.2-py37_0\n",
            "  conda-package-han~ pkgs/main/linux-64::conda-package-handling-1.6.0-py37h7b6447c_0\n",
            "  cryptography       pkgs/main/linux-64::cryptography-2.8-py37h1ba5d50_0\n",
            "  idna               pkgs/main/linux-64::idna-2.8-py37_0\n",
            "  ld_impl_linux-64   pkgs/main/linux-64::ld_impl_linux-64-2.33.1-h53a641e_7\n",
            "  libedit            pkgs/main/linux-64::libedit-3.1.20181209-hc058e9b_0\n",
            "  libffi             pkgs/main/linux-64::libffi-3.2.1-hd88cf55_4\n",
            "  libgcc-ng          pkgs/main/linux-64::libgcc-ng-9.1.0-hdf63c60_0\n",
            "  libstdcxx-ng       pkgs/main/linux-64::libstdcxx-ng-9.1.0-hdf63c60_0\n",
            "  ncurses            pkgs/main/linux-64::ncurses-6.2-he6710b0_0\n",
            "  openssl            pkgs/main/linux-64::openssl-1.1.1d-h7b6447c_4\n",
            "  pip                pkgs/main/linux-64::pip-20.0.2-py37_1\n",
            "  pycosat            pkgs/main/linux-64::pycosat-0.6.3-py37h7b6447c_0\n",
            "  pycparser          pkgs/main/linux-64::pycparser-2.19-py37_0\n",
            "  pyopenssl          pkgs/main/linux-64::pyopenssl-19.1.0-py37_0\n",
            "  pysocks            pkgs/main/linux-64::pysocks-1.7.1-py37_0\n",
            "  python             pkgs/main/linux-64::python-3.7.6-h0371630_2\n",
            "  readline           pkgs/main/linux-64::readline-7.0-h7b6447c_5\n",
            "  requests           pkgs/main/linux-64::requests-2.22.0-py37_1\n",
            "  ruamel_yaml        pkgs/main/linux-64::ruamel_yaml-0.15.87-py37h7b6447c_0\n",
            "  setuptools         pkgs/main/linux-64::setuptools-45.2.0-py37_0\n",
            "  six                pkgs/main/linux-64::six-1.14.0-py37_0\n",
            "  sqlite             pkgs/main/linux-64::sqlite-3.31.1-h7b6447c_0\n",
            "  tk                 pkgs/main/linux-64::tk-8.6.8-hbc83047_0\n",
            "  tqdm               pkgs/main/noarch::tqdm-4.42.1-py_0\n",
            "  urllib3            pkgs/main/linux-64::urllib3-1.25.8-py37_0\n",
            "  wheel              pkgs/main/linux-64::wheel-0.34.2-py37_0\n",
            "  xz                 pkgs/main/linux-64::xz-5.2.4-h14c3975_4\n",
            "  yaml               pkgs/main/linux-64::yaml-0.1.7-had09818_2\n",
            "  zlib               pkgs/main/linux-64::zlib-1.2.11-h7b6447c_3\n",
            "\n",
            "\n",
            "Preparing transaction: / \b\b- \b\b\\ \b\b| \b\bdone\n",
            "Executing transaction: - \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\bdone\n",
            "installation finished.\n",
            "WARNING:\n",
            "    You currently have a PYTHONPATH environment variable set. This may cause\n",
            "    unexpected behavior when running the Python interpreter in Miniconda3.\n",
            "    For best results, please verify that your PYTHONPATH only points to\n",
            "    directories of packages that are compatible with the Python interpreter\n",
            "    in Miniconda3: /usr/local\n",
            "Collecting package metadata (current_repodata.json): - \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\bdone\n",
            "Solving environment: | \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\bdone\n",
            "\n",
            "## Package Plan ##\n",
            "\n",
            "  environment location: /usr/local\n",
            "\n",
            "  added / updated specs:\n",
            "    - rdkit\n",
            "\n",
            "\n",
            "The following packages will be downloaded:\n",
            "\n",
            "    package                    |            build\n",
            "    ---------------------------|-----------------\n",
            "    _openmp_mutex-4.5          |            1_gnu          22 KB\n",
            "    blas-1.0                   |              mkl           6 KB\n",
            "    bottleneck-1.3.2           |   py37heb32a55_1         125 KB\n",
            "    bzip2-1.0.8                |       h7b6447c_0          78 KB\n",
            "    ca-certificates-2021.10.26 |       h06a4308_2         115 KB\n",
            "    cairo-1.16.0               |       hf32fb01_1         1.0 MB\n",
            "    certifi-2021.10.8          |   py37h06a4308_0         151 KB\n",
            "    conda-4.11.0               |   py37h06a4308_0        14.4 MB\n",
            "    fontconfig-2.13.1          |       h6c09931_0         250 KB\n",
            "    freetype-2.11.0            |       h70c0345_0         618 KB\n",
            "    giflib-5.2.1               |       h7b6447c_0          78 KB\n",
            "    glib-2.69.1                |       h5202010_0         1.7 MB\n",
            "    icu-58.2                   |       he6710b0_3        10.5 MB\n",
            "    intel-openmp-2021.4.0      |    h06a4308_3561         4.2 MB\n",
            "    jpeg-9d                    |       h7f8727e_0         232 KB\n",
            "    lcms2-2.12                 |       h3be6417_0         312 KB\n",
            "    libboost-1.73.0            |      h3ff78a5_11        13.9 MB\n",
            "    libffi-3.3                 |       he6710b0_2          50 KB\n",
            "    libgcc-ng-9.3.0            |      h5101ec6_17         4.8 MB\n",
            "    libgomp-9.3.0              |      h5101ec6_17         311 KB\n",
            "    libpng-1.6.37              |       hbc83047_0         278 KB\n",
            "    libtiff-4.2.0              |       h85742a9_0         502 KB\n",
            "    libuuid-1.0.3              |       h7f8727e_2          17 KB\n",
            "    libwebp-1.2.0              |       h89dd481_0         493 KB\n",
            "    libwebp-base-1.2.0         |       h27cfd23_0         437 KB\n",
            "    libxcb-1.14                |       h7b6447c_0         505 KB\n",
            "    libxml2-2.9.12             |       h03d6c58_0         1.2 MB\n",
            "    lz4-c-1.9.3                |       h295c915_1         185 KB\n",
            "    mkl-2021.4.0               |     h06a4308_640       142.6 MB\n",
            "    mkl-service-2.4.0          |   py37h7f8727e_0          56 KB\n",
            "    mkl_fft-1.3.1              |   py37hd3c417c_0         172 KB\n",
            "    mkl_random-1.2.2           |   py37h51133e4_0         287 KB\n",
            "    numexpr-2.8.1              |   py37h6abb31d_0         123 KB\n",
            "    numpy-1.21.2               |   py37h20f2e39_0          23 KB\n",
            "    numpy-base-1.21.2          |   py37h79a1101_0         4.8 MB\n",
            "    olefile-0.46               |           py37_0          50 KB\n",
            "    openssl-1.1.1l             |       h7f8727e_0         2.5 MB\n",
            "    packaging-21.3             |     pyhd3eb1b0_0          36 KB\n",
            "    pandas-1.3.5               |   py37h8c16a72_0         9.3 MB\n",
            "    pcre-8.45                  |       h295c915_0         207 KB\n",
            "    pillow-8.4.0               |   py37h5aabda8_0         644 KB\n",
            "    pixman-0.40.0              |       h7f8727e_1         373 KB\n",
            "    py-boost-1.73.0            |  py37ha9443f7_11         204 KB\n",
            "    pyparsing-3.0.4            |     pyhd3eb1b0_0          81 KB\n",
            "    python-dateutil-2.8.2      |     pyhd3eb1b0_0         233 KB\n",
            "    pytz-2021.3                |     pyhd3eb1b0_0         171 KB\n",
            "    rdkit-2020.09.1.0          |   py37hd50e099_1        25.8 MB  rdkit\n",
            "    xz-5.2.5                   |       h7b6447c_0         341 KB\n",
            "    zstd-1.4.9                 |       haebb681_0         480 KB\n",
            "    ------------------------------------------------------------\n",
            "                                           Total:       244.8 MB\n",
            "\n",
            "The following NEW packages will be INSTALLED:\n",
            "\n",
            "  _openmp_mutex      pkgs/main/linux-64::_openmp_mutex-4.5-1_gnu\n",
            "  blas               pkgs/main/linux-64::blas-1.0-mkl\n",
            "  bottleneck         pkgs/main/linux-64::bottleneck-1.3.2-py37heb32a55_1\n",
            "  bzip2              pkgs/main/linux-64::bzip2-1.0.8-h7b6447c_0\n",
            "  cairo              pkgs/main/linux-64::cairo-1.16.0-hf32fb01_1\n",
            "  fontconfig         pkgs/main/linux-64::fontconfig-2.13.1-h6c09931_0\n",
            "  freetype           pkgs/main/linux-64::freetype-2.11.0-h70c0345_0\n",
            "  giflib             pkgs/main/linux-64::giflib-5.2.1-h7b6447c_0\n",
            "  glib               pkgs/main/linux-64::glib-2.69.1-h5202010_0\n",
            "  icu                pkgs/main/linux-64::icu-58.2-he6710b0_3\n",
            "  intel-openmp       pkgs/main/linux-64::intel-openmp-2021.4.0-h06a4308_3561\n",
            "  jpeg               pkgs/main/linux-64::jpeg-9d-h7f8727e_0\n",
            "  lcms2              pkgs/main/linux-64::lcms2-2.12-h3be6417_0\n",
            "  libboost           pkgs/main/linux-64::libboost-1.73.0-h3ff78a5_11\n",
            "  libgomp            pkgs/main/linux-64::libgomp-9.3.0-h5101ec6_17\n",
            "  libpng             pkgs/main/linux-64::libpng-1.6.37-hbc83047_0\n",
            "  libtiff            pkgs/main/linux-64::libtiff-4.2.0-h85742a9_0\n",
            "  libuuid            pkgs/main/linux-64::libuuid-1.0.3-h7f8727e_2\n",
            "  libwebp            pkgs/main/linux-64::libwebp-1.2.0-h89dd481_0\n",
            "  libwebp-base       pkgs/main/linux-64::libwebp-base-1.2.0-h27cfd23_0\n",
            "  libxcb             pkgs/main/linux-64::libxcb-1.14-h7b6447c_0\n",
            "  libxml2            pkgs/main/linux-64::libxml2-2.9.12-h03d6c58_0\n",
            "  lz4-c              pkgs/main/linux-64::lz4-c-1.9.3-h295c915_1\n",
            "  mkl                pkgs/main/linux-64::mkl-2021.4.0-h06a4308_640\n",
            "  mkl-service        pkgs/main/linux-64::mkl-service-2.4.0-py37h7f8727e_0\n",
            "  mkl_fft            pkgs/main/linux-64::mkl_fft-1.3.1-py37hd3c417c_0\n",
            "  mkl_random         pkgs/main/linux-64::mkl_random-1.2.2-py37h51133e4_0\n",
            "  numexpr            pkgs/main/linux-64::numexpr-2.8.1-py37h6abb31d_0\n",
            "  numpy              pkgs/main/linux-64::numpy-1.21.2-py37h20f2e39_0\n",
            "  numpy-base         pkgs/main/linux-64::numpy-base-1.21.2-py37h79a1101_0\n",
            "  olefile            pkgs/main/linux-64::olefile-0.46-py37_0\n",
            "  packaging          pkgs/main/noarch::packaging-21.3-pyhd3eb1b0_0\n",
            "  pandas             pkgs/main/linux-64::pandas-1.3.5-py37h8c16a72_0\n",
            "  pcre               pkgs/main/linux-64::pcre-8.45-h295c915_0\n",
            "  pillow             pkgs/main/linux-64::pillow-8.4.0-py37h5aabda8_0\n",
            "  pixman             pkgs/main/linux-64::pixman-0.40.0-h7f8727e_1\n",
            "  py-boost           pkgs/main/linux-64::py-boost-1.73.0-py37ha9443f7_11\n",
            "  pyparsing          pkgs/main/noarch::pyparsing-3.0.4-pyhd3eb1b0_0\n",
            "  python-dateutil    pkgs/main/noarch::python-dateutil-2.8.2-pyhd3eb1b0_0\n",
            "  pytz               pkgs/main/noarch::pytz-2021.3-pyhd3eb1b0_0\n",
            "  rdkit              rdkit/linux-64::rdkit-2020.09.1.0-py37hd50e099_1\n",
            "  zstd               pkgs/main/linux-64::zstd-1.4.9-haebb681_0\n",
            "\n",
            "The following packages will be UPDATED:\n",
            "\n",
            "  ca-certificates                                2020.1.1-0 --> 2021.10.26-h06a4308_2\n",
            "  certifi                                 2019.11.28-py37_0 --> 2021.10.8-py37h06a4308_0\n",
            "  conda                                        4.8.2-py37_0 --> 4.11.0-py37h06a4308_0\n",
            "  libffi                                   3.2.1-hd88cf55_4 --> 3.3-he6710b0_2\n",
            "  libgcc-ng                                9.1.0-hdf63c60_0 --> 9.3.0-h5101ec6_17\n",
            "  openssl                                 1.1.1d-h7b6447c_4 --> 1.1.1l-h7f8727e_0\n",
            "  xz                                       5.2.4-h14c3975_4 --> 5.2.5-h7b6447c_0\n",
            "\n",
            "\n",
            "\n",
            "Downloading and Extracting Packages\n",
            "mkl_random-1.2.2     | 287 KB    | : 100% 1.0/1 [00:00<00:00,  7.47it/s]\n",
            "mkl-2021.4.0         | 142.6 MB  | : 100% 1.0/1 [00:06<00:00,  6.34s/it]               \n",
            "libgcc-ng-9.3.0      | 4.8 MB    | : 100% 1.0/1 [00:00<00:00,  4.05it/s]\n",
            "cairo-1.16.0         | 1.0 MB    | : 100% 1.0/1 [00:00<00:00,  8.53it/s]\n",
            "libffi-3.3           | 50 KB     | : 100% 1.0/1 [00:00<00:00, 14.03it/s]\n",
            "openssl-1.1.1l       | 2.5 MB    | : 100% 1.0/1 [00:00<00:00,  3.79it/s]\n",
            "packaging-21.3       | 36 KB     | : 100% 1.0/1 [00:00<00:00, 12.84it/s]\n",
            "conda-4.11.0         | 14.4 MB   | : 100% 1.0/1 [00:00<00:00,  1.80it/s]               \n",
            "numexpr-2.8.1        | 123 KB    | : 100% 1.0/1 [00:00<00:00, 14.29it/s]\n",
            "fontconfig-2.13.1    | 250 KB    | : 100% 1.0/1 [00:00<00:00, 11.40it/s]\n",
            "zstd-1.4.9           | 480 KB    | : 100% 1.0/1 [00:00<00:00, 11.98it/s]\n",
            "libtiff-4.2.0        | 502 KB    | : 100% 1.0/1 [00:00<00:00, 11.15it/s]\n",
            "olefile-0.46         | 50 KB     | : 100% 1.0/1 [00:00<00:00, 16.87it/s]\n",
            "py-boost-1.73.0      | 204 KB    | : 100% 1.0/1 [00:00<00:00, 13.83it/s]\n",
            "ca-certificates-2021 | 115 KB    | : 100% 1.0/1 [00:00<00:00, 14.47it/s]\n",
            "pytz-2021.3          | 171 KB    | : 100% 1.0/1 [00:00<00:00,  8.51it/s]\n",
            "pillow-8.4.0         | 644 KB    | : 100% 1.0/1 [00:00<00:00,  9.36it/s]\n",
            "intel-openmp-2021.4. | 4.2 MB    | : 100% 1.0/1 [00:00<00:00,  3.77it/s]\n",
            "lz4-c-1.9.3          | 185 KB    | : 100% 1.0/1 [00:00<00:00, 12.77it/s]\n",
            "libboost-1.73.0      | 13.9 MB   | : 100% 1.0/1 [00:02<00:00,  1.38s/it]                \n",
            "libwebp-base-1.2.0   | 437 KB    | : 100% 1.0/1 [00:00<00:00, 12.74it/s]\n",
            "blas-1.0             | 6 KB      | : 100% 1.0/1 [00:00<00:00, 17.09it/s]\n",
            "pixman-0.40.0        | 373 KB    | : 100% 1.0/1 [00:00<00:00, 14.27it/s]\n",
            "libgomp-9.3.0        | 311 KB    | : 100% 1.0/1 [00:00<00:00, 12.94it/s]\n",
            "pyparsing-3.0.4      | 81 KB     | : 100% 1.0/1 [00:00<00:00, 15.07it/s]\n",
            "pcre-8.45            | 207 KB    | : 100% 1.0/1 [00:00<00:00, 15.24it/s]\n",
            "bottleneck-1.3.2     | 125 KB    | : 100% 1.0/1 [00:00<00:00, 13.13it/s]\n",
            "libxml2-2.9.12       | 1.2 MB    | : 100% 1.0/1 [00:00<00:00,  6.17it/s]\n",
            "mkl_fft-1.3.1        | 172 KB    | : 100% 1.0/1 [00:00<00:00, 13.80it/s]\n",
            "icu-58.2             | 10.5 MB   | : 100% 1.0/1 [00:00<00:00,  2.23it/s]               \n",
            "rdkit-2020.09.1.0    | 25.8 MB   | : 100% 1.0/1 [00:06<00:00,  6.13s/it]\n",
            "pandas-1.3.5         | 9.3 MB    | : 100% 1.0/1 [00:00<00:00,  1.80it/s]               \n",
            "freetype-2.11.0      | 618 KB    | : 100% 1.0/1 [00:00<00:00, 11.38it/s]\n",
            "glib-2.69.1          | 1.7 MB    | : 100% 1.0/1 [00:00<00:00,  6.34it/s]\n",
            "libpng-1.6.37        | 278 KB    | : 100% 1.0/1 [00:00<00:00, 14.12it/s]\n",
            "lcms2-2.12           | 312 KB    | : 100% 1.0/1 [00:00<00:00, 13.01it/s]\n",
            "_openmp_mutex-4.5    | 22 KB     | : 100% 1.0/1 [00:00<00:00, 31.96it/s]\n",
            "mkl-service-2.4.0    | 56 KB     | : 100% 1.0/1 [00:00<00:00, 12.00it/s]\n",
            "numpy-1.21.2         | 23 KB     | : 100% 1.0/1 [00:00<00:00, 10.60it/s]\n",
            "certifi-2021.10.8    | 151 KB    | : 100% 1.0/1 [00:00<00:00, 11.42it/s]\n",
            "libuuid-1.0.3        | 17 KB     | : 100% 1.0/1 [00:00<00:00, 15.63it/s]\n",
            "libxcb-1.14          | 505 KB    | : 100% 1.0/1 [00:00<00:00,  8.55it/s]\n",
            "xz-5.2.5             | 341 KB    | : 100% 1.0/1 [00:00<00:00, 12.58it/s]\n",
            "libwebp-1.2.0        | 493 KB    | : 100% 1.0/1 [00:00<00:00, 11.87it/s]\n",
            "jpeg-9d              | 232 KB    | : 100% 1.0/1 [00:00<00:00, 14.92it/s]\n",
            "python-dateutil-2.8. | 233 KB    | : 100% 1.0/1 [00:00<00:00, 14.36it/s]\n",
            "giflib-5.2.1         | 78 KB     | : 100% 1.0/1 [00:00<00:00, 12.06it/s]\n",
            "bzip2-1.0.8          | 78 KB     | : 100% 1.0/1 [00:00<00:00, 16.64it/s]\n",
            "numpy-base-1.21.2    | 4.8 MB    | : 100% 1.0/1 [00:00<00:00,  2.40it/s]\n",
            "Preparing transaction: - \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\bdone\n",
            "Verifying transaction: - \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\bdone\n",
            "Executing transaction: \\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\bdone\n"
          ]
        }
      ],
      "source": [
        "! wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh\n",
        "! chmod +x Miniconda3-py37_4.8.2-Linux-x86_64.sh\n",
        "! bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local\n",
        "! conda install -c rdkit rdkit -y\n",
        "import sys\n",
        "sys.path.append('/usr/local/lib/python3.7/site-packages/')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "QmxXXFa4wTNG"
      },
      "source": [
        "## **Load bioactivity data**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "id": "Fpu5C7HlwV9s"
      },
      "outputs": [],
      "source": [
        "import pandas as pd"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "id": "GCcE8J5XwjtB"
      },
      "outputs": [],
      "source": [
        "df = pd.read_csv('/content/sars-cov-2_bioactivity_data_preprocessed.csv')"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "df"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 424
        },
        "id": "8pVAyPqmg9ym",
        "outputId": "28cf2f6f-a394-4bf9-c120-244b407107b9"
      },
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "\n",
              "  <div id=\"df-c4037cb1-c6a8-4be0-9da2-4dc182c617f8\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>molecule_chembl_id</th>\n",
              "      <th>canonical_smiles</th>\n",
              "      <th>standard_value</th>\n",
              "      <th>bioactivity_class</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>CHEMBL3301610</td>\n",
              "      <td>CCN1CCN(Cc2ccc(Nc3ncc(F)c(-c4cc(F)c5nc(C)n(C(C...</td>\n",
              "      <td>6620.0</td>\n",
              "      <td>intermediate</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>CHEMBL682</td>\n",
              "      <td>CCN(CC)Cc1cc(Nc2ccnc3cc(Cl)ccc23)ccc1O</td>\n",
              "      <td>5150.0</td>\n",
              "      <td>intermediate</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>CHEMBL264241</td>\n",
              "      <td>CCCCCOc1ccc(-c2ccc(-c3ccc(C(=O)N[C@H]4C[C@@H](...</td>\n",
              "      <td>4640.0</td>\n",
              "      <td>intermediate</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>CHEMBL46740</td>\n",
              "      <td>Cc1c(-c2ccc(O)cc2)n(Cc2ccc(OCCN3CCCCCC3)cc2)c2...</td>\n",
              "      <td>3440.0</td>\n",
              "      <td>intermediate</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>CHEMBL504323</td>\n",
              "      <td>COc1cc2c3cc1Oc1c(OC)c(OC)cc4c1[C@@H](Cc1ccc(O)...</td>\n",
              "      <td>7870.0</td>\n",
              "      <td>intermediate</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>281</th>\n",
              "      <td>CHEMBL583194</td>\n",
              "      <td>c1cncc(CN2CCC(n3ncc4c(N5CCOCC5)nc(-c5ccc6[nH]c...</td>\n",
              "      <td>9430.0</td>\n",
              "      <td>intermediate</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>282</th>\n",
              "      <td>CHEMBL3967196</td>\n",
              "      <td>CCCCn1c(NC(=O)c2cccs2)c(C#N)c2nc3ccccc3nc21</td>\n",
              "      <td>19650.0</td>\n",
              "      <td>inactive</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>283</th>\n",
              "      <td>CHEMBL601661</td>\n",
              "      <td>CNC(=O)Nc1ccc(-c2nc(N3CC4CCC(C3)O4)c3cnn(C4CCC...</td>\n",
              "      <td>21620.0</td>\n",
              "      <td>inactive</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>284</th>\n",
              "      <td>CHEMBL178334</td>\n",
              "      <td>CC(C)N1CCN(c2ccc(C(=O)c3c(-c4ccc(O)cc4)sc4cc(O...</td>\n",
              "      <td>29870.0</td>\n",
              "      <td>inactive</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>285</th>\n",
              "      <td>CHEMBL2178735</td>\n",
              "      <td>Nc1ccc(C(=O)Nc2cccc(-c3nc(N4CCOCC4)c4oc5ncccc5...</td>\n",
              "      <td>4730.0</td>\n",
              "      <td>intermediate</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>286 rows × 4 columns</p>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-c4037cb1-c6a8-4be0-9da2-4dc182c617f8')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-c4037cb1-c6a8-4be0-9da2-4dc182c617f8 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-c4037cb1-c6a8-4be0-9da2-4dc182c617f8');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ],
            "text/plain": [
              "    molecule_chembl_id  ... bioactivity_class\n",
              "0        CHEMBL3301610  ...      intermediate\n",
              "1            CHEMBL682  ...      intermediate\n",
              "2         CHEMBL264241  ...      intermediate\n",
              "3          CHEMBL46740  ...      intermediate\n",
              "4         CHEMBL504323  ...      intermediate\n",
              "..                 ...  ...               ...\n",
              "281       CHEMBL583194  ...      intermediate\n",
              "282      CHEMBL3967196  ...          inactive\n",
              "283       CHEMBL601661  ...          inactive\n",
              "284       CHEMBL178334  ...          inactive\n",
              "285      CHEMBL2178735  ...      intermediate\n",
              "\n",
              "[286 rows x 4 columns]"
            ]
          },
          "metadata": {},
          "execution_count": 8
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "YzN_S4Quro5S"
      },
      "source": [
        "## **Calculate Lipinski descriptors**\n",
        "Christopher Lipinski, a scientist at Pfizer, came up with a set of rule-of-thumb for evaluating the **druglikeness** of compounds. Such druglikeness is based on the Absorption, Distribution, Metabolism and Excretion (ADME) that is also known as the pharmacokinetic profile. Lipinski analyzed all orally active FDA-approved drugs in the formulation of what is to be known as the **Rule-of-Five** or **Lipinski's Rule**.\n",
        "\n",
        "The Lipinski's Rule stated the following:\n",
        "* Molecular weight < 500 Dalton\n",
        "* Octanol-water partition coefficient (LogP) < 5\n",
        "* Hydrogen bond donors < 5\n",
        "* Hydrogen bond acceptors < 10 "
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "9qn_eQcnxY7C"
      },
      "source": [
        "### **Import libraries**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {
        "id": "CgBjIdT-rnRU"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "from rdkit import Chem\n",
        "from rdkit.Chem import Descriptors, Lipinski"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "e0MLOedB6j96"
      },
      "source": [
        "### **Convert IC50 to pIC50**\n",
        "To allow **IC50** data to be more uniformly distributed, we will convert **IC50** to the negative logarithmic scale which is essentially **-log10(IC50)**.\n",
        "\n",
        "This custom function pIC50() will accept a DataFrame as input and will:\n",
        "* Take the IC50 values from the ``standard_value`` column and converts it from nM to M by multiplying the value by 10$^{-9}$\n",
        "* Take the molar value and apply -log10\n",
        "* Delete the ``standard_value`` column and create a new ``pIC50`` column"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {
        "id": "UXMuFQoQ4pZF"
      },
      "outputs": [],
      "source": [
        "# https://github.com/chaninlab/estrogen-receptor-alpha-qsar/blob/master/02_ER_alpha_RO5.ipynb\n",
        "\n",
        "import numpy as np\n",
        "\n",
        "def pIC50(input):\n",
        "    pIC50 = []\n",
        "\n",
        "    for i in input['standard_value_norm']:\n",
        "        molar = i*(10**-9) # Converts nM to M\n",
        "        pIC50.append(-np.log10(molar))\n",
        "\n",
        "    input['pIC50'] = pIC50\n",
        "    x = input.drop('standard_value_norm', 1)\n",
        "        \n",
        "    return x"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "WU5Fh1h2OaJJ"
      },
      "source": [
        "Point to note: Values greater than 100,000,000 will be fixed at 100,000,000 otherwise the negative logarithmic value will become negative."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "QuUTFUpcR1wU",
        "outputId": "124a01df-11d1-4d13-f129-c011db9d915d"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "count      286.000000\n",
              "mean      8260.443717\n",
              "std       9395.814899\n",
              "min          6.653000\n",
              "25%       2086.990000\n",
              "50%       5365.160000\n",
              "75%      11735.000000\n",
              "max      91201.080000\n",
              "Name: standard_value, dtype: float64"
            ]
          },
          "metadata": {},
          "execution_count": 10
        }
      ],
      "source": [
        "df.standard_value.describe()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 11,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "QyiJ0to5N6Z_",
        "outputId": "b3334a91-74ad-493a-8218-4449656f6250"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "1.0"
            ]
          },
          "metadata": {},
          "execution_count": 11
        }
      ],
      "source": [
        "-np.log10( (10**-9)* 100000000 )"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 12,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "9S1aJkOYOP6K",
        "outputId": "dabaecd3-9598-4370-b0c8-74cf13e3c6f0"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "-1.0"
            ]
          },
          "metadata": {},
          "execution_count": 12
        }
      ],
      "source": [
        "-np.log10( (10**-9)* 10000000000 )"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 13,
      "metadata": {
        "id": "iktHDDwtPDwl"
      },
      "outputs": [],
      "source": [
        "def norm_value(input):\n",
        "    norm = []\n",
        "\n",
        "    for i in input['standard_value']:\n",
        "        if i > 100000000:\n",
        "          i = 100000000\n",
        "        norm.append(i)\n",
        "\n",
        "    input['standard_value_norm'] = norm\n",
        "    x = input.drop('standard_value', 1)\n",
        "        \n",
        "    return x"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "EkrTs7RfPsrH"
      },
      "source": [
        "We will first apply the norm_value() function so that the values in the standard_value column is normalized."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 14,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 424
        },
        "id": "EX2Mj2-ZP1Rj",
        "outputId": "47b16502-2eda-4e51-e567-c7ad7e40ebe5"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "\n",
              "  <div id=\"df-1ddff511-ca99-480b-a145-1804239f0f93\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>molecule_chembl_id</th>\n",
              "      <th>canonical_smiles</th>\n",
              "      <th>bioactivity_class</th>\n",
              "      <th>standard_value_norm</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>CHEMBL3301610</td>\n",
              "      <td>CCN1CCN(Cc2ccc(Nc3ncc(F)c(-c4cc(F)c5nc(C)n(C(C...</td>\n",
              "      <td>intermediate</td>\n",
              "      <td>6620.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>CHEMBL682</td>\n",
              "      <td>CCN(CC)Cc1cc(Nc2ccnc3cc(Cl)ccc23)ccc1O</td>\n",
              "      <td>intermediate</td>\n",
              "      <td>5150.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>CHEMBL264241</td>\n",
              "      <td>CCCCCOc1ccc(-c2ccc(-c3ccc(C(=O)N[C@H]4C[C@@H](...</td>\n",
              "      <td>intermediate</td>\n",
              "      <td>4640.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>CHEMBL46740</td>\n",
              "      <td>Cc1c(-c2ccc(O)cc2)n(Cc2ccc(OCCN3CCCCCC3)cc2)c2...</td>\n",
              "      <td>intermediate</td>\n",
              "      <td>3440.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>CHEMBL504323</td>\n",
              "      <td>COc1cc2c3cc1Oc1c(OC)c(OC)cc4c1[C@@H](Cc1ccc(O)...</td>\n",
              "      <td>intermediate</td>\n",
              "      <td>7870.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>281</th>\n",
              "      <td>CHEMBL583194</td>\n",
              "      <td>c1cncc(CN2CCC(n3ncc4c(N5CCOCC5)nc(-c5ccc6[nH]c...</td>\n",
              "      <td>intermediate</td>\n",
              "      <td>9430.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>282</th>\n",
              "      <td>CHEMBL3967196</td>\n",
              "      <td>CCCCn1c(NC(=O)c2cccs2)c(C#N)c2nc3ccccc3nc21</td>\n",
              "      <td>inactive</td>\n",
              "      <td>19650.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>283</th>\n",
              "      <td>CHEMBL601661</td>\n",
              "      <td>CNC(=O)Nc1ccc(-c2nc(N3CC4CCC(C3)O4)c3cnn(C4CCC...</td>\n",
              "      <td>inactive</td>\n",
              "      <td>21620.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>284</th>\n",
              "      <td>CHEMBL178334</td>\n",
              "      <td>CC(C)N1CCN(c2ccc(C(=O)c3c(-c4ccc(O)cc4)sc4cc(O...</td>\n",
              "      <td>inactive</td>\n",
              "      <td>29870.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>285</th>\n",
              "      <td>CHEMBL2178735</td>\n",
              "      <td>Nc1ccc(C(=O)Nc2cccc(-c3nc(N4CCOCC4)c4oc5ncccc5...</td>\n",
              "      <td>intermediate</td>\n",
              "      <td>4730.0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>286 rows × 4 columns</p>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-1ddff511-ca99-480b-a145-1804239f0f93')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-1ddff511-ca99-480b-a145-1804239f0f93 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-1ddff511-ca99-480b-a145-1804239f0f93');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ],
            "text/plain": [
              "    molecule_chembl_id  ... standard_value_norm\n",
              "0        CHEMBL3301610  ...              6620.0\n",
              "1            CHEMBL682  ...              5150.0\n",
              "2         CHEMBL264241  ...              4640.0\n",
              "3          CHEMBL46740  ...              3440.0\n",
              "4         CHEMBL504323  ...              7870.0\n",
              "..                 ...  ...                 ...\n",
              "281       CHEMBL583194  ...              9430.0\n",
              "282      CHEMBL3967196  ...             19650.0\n",
              "283       CHEMBL601661  ...             21620.0\n",
              "284       CHEMBL178334  ...             29870.0\n",
              "285      CHEMBL2178735  ...              4730.0\n",
              "\n",
              "[286 rows x 4 columns]"
            ]
          },
          "metadata": {},
          "execution_count": 14
        }
      ],
      "source": [
        "df_norm = norm_value(df)\n",
        "df_norm"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 15,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "hb1eKrIjRiH9",
        "outputId": "03255bca-2964-4e23-970b-cb66292ac691"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "count      286.000000\n",
              "mean      8260.443717\n",
              "std       9395.814899\n",
              "min          6.653000\n",
              "25%       2086.990000\n",
              "50%       5365.160000\n",
              "75%      11735.000000\n",
              "max      91201.080000\n",
              "Name: standard_value_norm, dtype: float64"
            ]
          },
          "metadata": {},
          "execution_count": 15
        }
      ],
      "source": [
        "df_norm.standard_value_norm.describe()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 16,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 424
        },
        "id": "UDKZzmK57YnS",
        "outputId": "11eee07a-7dbd-47ff-b66e-a6c992366e32"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "\n",
              "  <div id=\"df-fb587476-4742-4e27-b35c-e53e6e2abb72\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>molecule_chembl_id</th>\n",
              "      <th>canonical_smiles</th>\n",
              "      <th>bioactivity_class</th>\n",
              "      <th>pIC50</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>CHEMBL3301610</td>\n",
              "      <td>CCN1CCN(Cc2ccc(Nc3ncc(F)c(-c4cc(F)c5nc(C)n(C(C...</td>\n",
              "      <td>intermediate</td>\n",
              "      <td>5.179142</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>CHEMBL682</td>\n",
              "      <td>CCN(CC)Cc1cc(Nc2ccnc3cc(Cl)ccc23)ccc1O</td>\n",
              "      <td>intermediate</td>\n",
              "      <td>5.288193</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>CHEMBL264241</td>\n",
              "      <td>CCCCCOc1ccc(-c2ccc(-c3ccc(C(=O)N[C@H]4C[C@@H](...</td>\n",
              "      <td>intermediate</td>\n",
              "      <td>5.333482</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>CHEMBL46740</td>\n",
              "      <td>Cc1c(-c2ccc(O)cc2)n(Cc2ccc(OCCN3CCCCCC3)cc2)c2...</td>\n",
              "      <td>intermediate</td>\n",
              "      <td>5.463442</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>CHEMBL504323</td>\n",
              "      <td>COc1cc2c3cc1Oc1c(OC)c(OC)cc4c1[C@@H](Cc1ccc(O)...</td>\n",
              "      <td>intermediate</td>\n",
              "      <td>5.104025</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>281</th>\n",
              "      <td>CHEMBL583194</td>\n",
              "      <td>c1cncc(CN2CCC(n3ncc4c(N5CCOCC5)nc(-c5ccc6[nH]c...</td>\n",
              "      <td>intermediate</td>\n",
              "      <td>5.025488</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>282</th>\n",
              "      <td>CHEMBL3967196</td>\n",
              "      <td>CCCCn1c(NC(=O)c2cccs2)c(C#N)c2nc3ccccc3nc21</td>\n",
              "      <td>inactive</td>\n",
              "      <td>4.706637</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>283</th>\n",
              "      <td>CHEMBL601661</td>\n",
              "      <td>CNC(=O)Nc1ccc(-c2nc(N3CC4CCC(C3)O4)c3cnn(C4CCC...</td>\n",
              "      <td>inactive</td>\n",
              "      <td>4.665144</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>284</th>\n",
              "      <td>CHEMBL178334</td>\n",
              "      <td>CC(C)N1CCN(c2ccc(C(=O)c3c(-c4ccc(O)cc4)sc4cc(O...</td>\n",
              "      <td>inactive</td>\n",
              "      <td>4.524765</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>285</th>\n",
              "      <td>CHEMBL2178735</td>\n",
              "      <td>Nc1ccc(C(=O)Nc2cccc(-c3nc(N4CCOCC4)c4oc5ncccc5...</td>\n",
              "      <td>intermediate</td>\n",
              "      <td>5.325139</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>286 rows × 4 columns</p>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-fb587476-4742-4e27-b35c-e53e6e2abb72')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-fb587476-4742-4e27-b35c-e53e6e2abb72 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-fb587476-4742-4e27-b35c-e53e6e2abb72');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ],
            "text/plain": [
              "    molecule_chembl_id  ...     pIC50\n",
              "0        CHEMBL3301610  ...  5.179142\n",
              "1            CHEMBL682  ...  5.288193\n",
              "2         CHEMBL264241  ...  5.333482\n",
              "3          CHEMBL46740  ...  5.463442\n",
              "4         CHEMBL504323  ...  5.104025\n",
              "..                 ...  ...       ...\n",
              "281       CHEMBL583194  ...  5.025488\n",
              "282      CHEMBL3967196  ...  4.706637\n",
              "283       CHEMBL601661  ...  4.665144\n",
              "284       CHEMBL178334  ...  4.524765\n",
              "285      CHEMBL2178735  ...  5.325139\n",
              "\n",
              "[286 rows x 4 columns]"
            ]
          },
          "metadata": {},
          "execution_count": 16
        }
      ],
      "source": [
        "df_final = pIC50(df_norm)\n",
        "df_final"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 17,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "BoqY53udSTYC",
        "outputId": "6cd38d48-4825-4208-cd2f-d0adefda39a5"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "count    286.000000\n",
              "mean       5.402166\n",
              "std        0.663532\n",
              "min        4.040000\n",
              "25%        4.930520\n",
              "50%        5.270418\n",
              "75%        5.680522\n",
              "max        8.176982\n",
              "Name: pIC50, dtype: float64"
            ]
          },
          "metadata": {},
          "execution_count": 17
        }
      ],
      "source": [
        "df_final.pIC50.describe()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "05vHBWvqaQtb"
      },
      "source": [
        "### **Removing the 'intermediate' bioactivity class**\n",
        "Here, we will be removing the ``intermediate`` class from our data set."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 18,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 424
        },
        "id": "HmrndhDW3c7Z",
        "outputId": "2557cef5-ba82-4cc7-d774-6ea702ce1750"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "\n",
              "  <div id=\"df-f02cd2ce-476e-400e-a0e2-16845b56e9d9\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>molecule_chembl_id</th>\n",
              "      <th>canonical_smiles</th>\n",
              "      <th>bioactivity_class</th>\n",
              "      <th>pIC50</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>10</th>\n",
              "      <td>CHEMBL1751</td>\n",
              "      <td>C[C@H]1O[C@@H](O[C@H]2[C@@H](O)C[C@H](O[C@H]3[...</td>\n",
              "      <td>active</td>\n",
              "      <td>6.721246</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>11</th>\n",
              "      <td>CHEMBL1526240</td>\n",
              "      <td>CC(C)=CCCC1(C)C=Cc2c(O)c3c(c(CC=C(C)C)c2O1)OC1...</td>\n",
              "      <td>inactive</td>\n",
              "      <td>4.873869</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>17</th>\n",
              "      <td>CHEMBL496</td>\n",
              "      <td>Oc1c(Cl)cc(Cl)c(Cl)c1Cc1c(O)c(Cl)cc(Cl)c1Cl</td>\n",
              "      <td>active</td>\n",
              "      <td>6.045757</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>28</th>\n",
              "      <td>CHEMBL1448</td>\n",
              "      <td>O=C(Nc1ccc([N+](=O)[O-])cc1Cl)c1cc(Cl)ccc1O</td>\n",
              "      <td>active</td>\n",
              "      <td>6.552842</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>34</th>\n",
              "      <td>CHEMBL1242</td>\n",
              "      <td>Nc1ccc(/N=N/c2ccccc2)c(N)n1</td>\n",
              "      <td>inactive</td>\n",
              "      <td>4.552842</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>278</th>\n",
              "      <td>CHEMBL3913746</td>\n",
              "      <td>CC(C)(O)CNc1ncc(-c2ccnc(Nc3ccnc(Cl)c3)n2)c(CC2...</td>\n",
              "      <td>active</td>\n",
              "      <td>6.397940</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>279</th>\n",
              "      <td>CHEMBL1254577</td>\n",
              "      <td>O=C(NCCN1CCC2(CC1)C(=O)NCN2c1cccc(F)c1)c1ccc2c...</td>\n",
              "      <td>inactive</td>\n",
              "      <td>4.934047</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>282</th>\n",
              "      <td>CHEMBL3967196</td>\n",
              "      <td>CCCCn1c(NC(=O)c2cccs2)c(C#N)c2nc3ccccc3nc21</td>\n",
              "      <td>inactive</td>\n",
              "      <td>4.706637</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>283</th>\n",
              "      <td>CHEMBL601661</td>\n",
              "      <td>CNC(=O)Nc1ccc(-c2nc(N3CC4CCC(C3)O4)c3cnn(C4CCC...</td>\n",
              "      <td>inactive</td>\n",
              "      <td>4.665144</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>284</th>\n",
              "      <td>CHEMBL178334</td>\n",
              "      <td>CC(C)N1CCN(c2ccc(C(=O)c3c(-c4ccc(O)cc4)sc4cc(O...</td>\n",
              "      <td>inactive</td>\n",
              "      <td>4.524765</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>139 rows × 4 columns</p>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-f02cd2ce-476e-400e-a0e2-16845b56e9d9')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-f02cd2ce-476e-400e-a0e2-16845b56e9d9 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-f02cd2ce-476e-400e-a0e2-16845b56e9d9');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ],
            "text/plain": [
              "    molecule_chembl_id  ...     pIC50\n",
              "10          CHEMBL1751  ...  6.721246\n",
              "11       CHEMBL1526240  ...  4.873869\n",
              "17           CHEMBL496  ...  6.045757\n",
              "28          CHEMBL1448  ...  6.552842\n",
              "34          CHEMBL1242  ...  4.552842\n",
              "..                 ...  ...       ...\n",
              "278      CHEMBL3913746  ...  6.397940\n",
              "279      CHEMBL1254577  ...  4.934047\n",
              "282      CHEMBL3967196  ...  4.706637\n",
              "283       CHEMBL601661  ...  4.665144\n",
              "284       CHEMBL178334  ...  4.524765\n",
              "\n",
              "[139 rows x 4 columns]"
            ]
          },
          "metadata": {},
          "execution_count": 18
        }
      ],
      "source": [
        "df_2class = df_final[df_final.bioactivity_class != 'intermediate']\n",
        "df_2class"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 19,
      "metadata": {
        "id": "S1dHaoFj19V2"
      },
      "outputs": [],
      "source": [
        "df_2class.to_csv('sars-cov-2_bioactivity_data_processed_2classes_pIC50.csv')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "p9vA4-hQQ8sA"
      },
      "source": [
        "---"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "LMWOG2UIXEg-"
      },
      "outputs": [],
      "source": [
        ""
      ]
    }
  ],
  "metadata": {
    "colab": {
      "collapsed_sections": [],
      "name": "2_build.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.5"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}